#!/bin/bash
#
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Abort on the first command that fails outside of a conditional context.
set -o errexit

# This hook only does work on test machines that run in the lab, which are
# recognised by the marker file below; on any other machine it exits at once.
# See autotest/server/hosts/site_host.py for more information.
[ -f /mnt/stateful_partition/.labmachine ] || exit 0

# Whitespace-separated list of driver names that is_non_ethernet_driver
# reports as non-Ethernet.
NON_ETHERNET_DRIVERS="cdc_ether"

# Reports a critical message.
#
# Critical messages should be sent to /var/log/messages.  Other messages should
# be sent via echo to be harvested by recover_duts.py.
#
# The message is echoed to stdout and also handed to logger, tagged with the
# file name of this script.
# Arguments: the words of the message.
#
# TODO(tbroch) Relocate this to common hook library if/when there's more than
# one hook.
critical_msg() {
  echo "$@"
  # Strip the directory part with a parameter expansion instead of an unquoted
  # `basename $0`, which produced a wrong tag when the path contained spaces.
  logger -t "${0##*/}" -- "$@"
}

# Prints the default gateway: the next-hop address that follows "via" on the
# first line of `ip route get 1.0.0.0`.
#
# Prints nothing when the lookup fails or when the route has no gateway (an
# on-link route such as "1.0.0.0 dev eth0 ..."), instead of mistaking the
# device name for a gateway address.
get_default_gateway() {
  local ip_route
  ip_route="$(ip route get 1.0.0.0)" || ip_route=
  printf '%s\n' "${ip_route}" | awk '
    NR == 1 {
      for (i = 1; i < NF; i++) {
        if ($i == "via") {
          print $(i + 1)
          exit
        }
      }
    }'
}

# Checks whether driver name $1 appears in NON_ETHERNET_DRIVERS.
# Returns 0 when it is listed (a non-Ethernet driver), or 1 otherwise.
is_non_ethernet_driver() {
  local candidate="$1"
  local listed

  for listed in ${NON_ETHERNET_DRIVERS}; do
    # Stop at the first exact match.
    [ "${listed}" != "${candidate}" ] || return 0
  done
  return 1
}

# Returns 0 if $1 indicates its link is connected, or 1 otherwise.
is_connected() {
  local device="$1"
  ip link show "${device}" | grep -q LOWER_UP
}

# Prints the names of the Ethernet interfaces found on the system, one per
# line.
#
# Only /sys/class/net/eth* entries that have a device/driver link are
# considered.  An interface is printed when is_non_ethernet_driver does not
# flag its driver, or when is_connected reports its link as connected.
find_ethernet_interfaces() {
  local device_path
  local driver_path
  local device
  local driver

  for device_path in /sys/class/net/eth*; do
    driver_path="${device_path}/device/driver"
    # Also skips the literal pattern left behind when the glob matches nothing.
    if [ -e "${driver_path}" ]; then
      device=$(basename "${device_path}")
      # The inner substitution is quoted so that the resolved driver path
      # reaches basename as a single argument.
      driver=$(basename "$(readlink -f "${driver_path}")")
      if ! is_non_ethernet_driver "${driver}" || is_connected "${device}"; then
        echo "${device}"
      fi
    fi
  done
}

# Prints the names of the USB-Ethernet interfaces found on the system, one per
# line.  An eth* interface counts as USB when the resolved path of its
# /sys/class/net entry contains "usb".
find_usb_ethernet_interfaces() {
  # Keep the loop variable out of the global scope.
  local device_path

  for device_path in /sys/class/net/eth*; do
    # Skip the literal pattern left behind when the glob matches nothing.
    [ -e "${device_path}" ] || continue
    if readlink -f "${device_path}" | grep -q usb; then
      basename "${device_path}"
    fi
  done
}

# Pings the given IP address through each wired Ethernet device in turn.
# $1 - IP address to ping.
# Returns 0 as soon as a ping through one of the interfaces listed by
# find_ethernet_interfaces succeeds, or 1 if none succeeds.
do_ping() {
  local ip_addr=$1
  # Keep the loop variable out of the global scope.
  local eth

  for eth in $(find_ethernet_interfaces); do
    ping -q -I "${eth}" -c 9 "${ip_addr}" && return 0
  done
  return 1
}

# Prints the remote IP address of the first established SSH connection to
# this host, i.e. a TCP connection whose local port is exactly 22.
# Prints nothing when there is no such connection.
find_ssh_client() {
  netstat -lanp | awk '
    # Fields: proto, recv-q, send-q, local address, foreign address, state.
    $1 ~ /^tcp/ && $4 ~ /:22$/ && $6 == "ESTABLISHED" {
      remote = $5
      sub(/:[0-9]+$/, "", remote)   # drop the remote port
      sub(/^::ffff:/, "", remote)   # unwrap an IPv4-mapped IPv6 address
      if (remote != "") {
        print remote
        exit
      }
    }'
}

# Tries to reach the machine controlling this host: first the default gateway,
# then a connected SSH client (our autotest server).
# Returns 0 as soon as one of them answers a ping, or 1 if neither does.
ping_controlling_server() {
  # Declaration and assignment are kept apart: `local var="$(cmd)"` yields the
  # status of `local`, so a `||` fallback attached to it can never run.
  local default_gateway
  default_gateway="$(get_default_gateway)" || default_gateway=
  if [ -n "${default_gateway}" ]; then
    do_ping "${default_gateway}" && return 0
  fi

  local ssh_clients
  ssh_clients="$(find_ssh_client)" || ssh_clients=
  # Several addresses may be reported; only the first one is pinged.  The
  # unquoted expansion is intentional: it splits the list on whitespace.
  local ssh_client=
  local candidate
  for candidate in ${ssh_clients}; do
    ssh_client="${candidate}"
    break
  done
  if [ -n "${ssh_client}" ]; then
    do_ping "${ssh_client}" && return 0
  fi
  return 1
}

# Runs reload_network_device on every interface reported by
# find_usb_ethernet_interfaces, announcing each one on stdout.
# Returns 0 if at least one interface was processed (the status of the reload
# command itself is not checked), or 1 if no interface was found.
reload_usb_ethernet_devices() {
  local iface

  # Hold the interface list in the positional parameters.
  set -- $(find_usb_ethernet_interfaces)
  [ "$#" -gt 0 ] || return 1

  for iface in "$@"; do
    echo "Reload interface ${iface}"
    reload_network_device "${iface}"
  done
  return 0
}

# Takes every interface reported by find_ethernet_interfaces down and back up
# with ifconfig, announcing each one on stdout.
# Returns 0 if at least one interface was bounced, or 1 if none was found.
toggle_ethernet_interfaces() {
  local found

  # Hold the interface list in the positional parameters and consume it.
  set -- $(find_ethernet_interfaces)
  found=$#

  while [ "$#" -gt 0 ]; do
    echo "Bounce interface $1"
    ifconfig "$1" down
    ifconfig "$1" up
    shift
  done

  [ "${found}" -gt 0 ]
}

# Restarts the shill job through initctl.
# A failed stop is reported on stdout and otherwise ignored; the return status
# is that of `initctl start shill`.
restart_connection_manager() {
  if ! initctl stop shill; then
    echo "Shill was not running."
  fi
  initctl start shill
}

# Restarts the connection manager unless `initctl status shill` already
# mentions "running".
# Returns 1 when shill was already running (nothing was done); otherwise
# returns the status of restart_connection_manager.
ensure_connection_manager_is_running() {
  if ! initctl status shill | grep -q running; then
    restart_connection_manager
    return
  fi
  return 1
}

# Runs the recovery methods in order until connectivity is restored.
#
# After each method that reports success, waits 30 seconds, dumps the
# interface configuration with `ifconfig -a` and pings the controlling server.
# Returns 0 as soon as that ping succeeds, or 1 when every method has been
# tried without restoring connectivity.
recover_network() {
  # Keep the loop variable out of the global scope.
  local recovery_method

  for recovery_method in \
      ensure_connection_manager_is_running \
      toggle_ethernet_interfaces \
      reload_usb_ethernet_devices \
      restart_connection_manager; do
    critical_msg "Attempting recovery method \"${recovery_method}\""
    # A success return from the recovery method implies that it successfully
    # performed some action that makes it worth re-checking to see whether
    # our connectivity was remediated.  Otherwise, we move on to the next
    # recovery method without delay.
    "${recovery_method}" || continue
    sleep 30
    ifconfig -a

    if ping_controlling_server; then
      critical_msg "Recovery method \"${recovery_method}\" successful"
      return 0
    fi
  done
  return 1
}

# Time budget that main allows for recovery retries, in minutes, and the same
# budget converted to seconds.
TIMEOUT_MINUTES=15
TIMEOUT=$((60 * TIMEOUT_MINUTES))

# Checks connectivity to the controlling autotest server over ethernet and, if
# it is lost, keeps attempting recovery until TIMEOUT seconds have elapsed.
#
# We guarantee a minimum of 12 minutes network timeout tolerance for tests
# that disrupt connectivity with the SSH connection from the autotest server.
# The timeout here is 15 minutes to make sure it can never fail before that
# SSH session does.
#
# Returns 0 once the server is reachable.  Otherwise schedules a reboot in the
# background and returns 1.
main() {
  local deadline=$(( $(date +%s) + TIMEOUT ))

  # Fast path: already connected, or the first recovery round fixed it.
  if ping_controlling_server || recover_network; then
    return 0
  fi

  critical_msg "Restart failed; will retry recovery for ~$TIMEOUT_MINUTES minutes"
  while [ "$(date +%s)" -lt "${deadline}" ]; do
    sleep 30
    if ping_controlling_server; then
      critical_msg "Gateway now reachable; ending recovery loop"
      return 0
    fi
    recover_network && return 0
  done

  # Out of time: flush file systems and reboot shortly after returning.
  critical_msg "All efforts to recover ethernet have been exhausted. Rebooting."
  sync
  (sleep 5 && reboot) &
  return 1
}

# Script entry point: run the connectivity check and recovery logic.
main
